In [5]:
import pandas
from matplotlib import pyplot as plt
from matplotlib_venn import venn3, venn2
%matplotlib inline
from IPython.display import set_matplotlib_formats
#set_matplotlib_formats('pdf')
# Raw inputs. `read_csv` with its defaults keeps a plain integer index, which
# is what `from_csv(..., index_col=None)` produced; `DataFrame.from_csv` itself
# is deprecated and absent from current pandas releases.
cran_release = pandas.read_csv('../data/cran-packages-150601.csv')
data = pandas.read_csv('../data/github-cran-bioc-alldata-150420.csv')
In [6]:
# Names that are filtered out of every dependency list later in the notebook:
# 'R' itself plus what appear to be R's base/recommended packages
# (NOTE(review): confirm the list against the R distribution being studied).
R_packages = [
    'R', 'MASS', 'Matrix', 'base', 'boot', 'class', 'cluster', 'codetools',
    'compiler', 'datasets', 'foreign', 'grDevices',
    'graphics', 'grid', 'lattice', 'methods', 'mgcv', 'nlme', 'nnet',
    'parallel', 'rpart',
    'spatial', 'splines', 'stats', 'stats4', 'survival', 'tcltk', 'tools',
    'translations', 'utils',
]
In [7]:
# Reduce the CRAN table to one row per package: after sorting by `mtime`,
# keep the first (smallest `mtime`) row of each package, rename the key column
# to 'Package' so it matches the column used in `data`, and drop everything
# except the key and the timestamp.
# `sort_values` / `keep='first'` replace the removed `sort` / `take_last=False`.
cran_release = (
    cran_release
    .sort_values('mtime')
    .drop_duplicates('package', keep='first')
    .rename(columns={'package': 'Package'})
    [['Package', 'mtime']]
)
In [8]:
# Keep only CRAN and GitHub rows and, for every (Package, Source) pair, the
# row with the largest 'Date' (last one after sorting). Missing values become
# empty strings so that 'Depends' / 'Imports' can be split as text afterwards.
# `sort_values` / `keep='last'` replace the removed `sort` / `take_last=True`.
data = (
    data
    .query('Source == "cran" or Source == "github"')
    .sort_values('Date')
    .drop_duplicates(['Package', 'Source'], keep='last')
    [['Package', 'Version', 'Source', 'Date', 'Depends', 'Imports']]
    .fillna('')
)
In [9]:
# packages[name][source] -> list of dependency names declared by the
# `source` ('cran' or 'github') version of package `name`, with the entries
# of R_packages removed.
packages = {}
for _idx, record in data.iterrows():
    # 'Depends' and 'Imports' are space-separated; empty tokens are dropped.
    raw_tokens = record['Depends'].split(' ') + record['Imports'].split(' ')
    names = [token.strip() for token in raw_tokens if len(token.strip()) > 0]
    external = [name for name in names if name not in R_packages]
    packages.setdefault(record['Package'], {})[record['Source']] = external
In [10]:
# cran_required[source] -> set of packages that (a) have a CRAN entry in
# `packages` and (b) are listed as a dependency by at least one package whose
# entry comes from `source`.
# `.values()` / `.items()` are used instead of the Python-2-only
# `.iteritems()` so the cell runs on both Python 2 and Python 3.
cran_required = {'github': set(), 'cran': set()}
for package in packages.values():
    for source, deps in package.items():
        for dep in deps:
            # A dependency counts only when it is itself known on CRAN.
            if packages.get(dep, {}).get('cran', None) is not None:
                cran_required[source].add(dep)
In [11]:
# Overlap between the CRAN packages required from GitHub and those required
# from CRAN itself.
venn2(subsets=(cran_required['github'], cran_required['cran']),
      set_labels=('github', 'cran'))
Out[11]:
In [12]:
# One row per CRAN package, indexed by package name, carrying its 'Date'
# from `data` and its `mtime` from the CRAN table. The left join keeps
# packages that have no match in `cran_release`.
cran_rows = data.query('Source == "cran"')
required = cran_rows[['Package', 'Date']].merge(cran_release, how='left', on='Package')
required = required.set_index('Package')
In [13]:
# Create the five flag columns, all initially NaN ("not flagged").
# `float('nan')` is used because the `pandas.np` alias is not available in
# current pandas; the column order matches the original chained assignment.
for column in ('GitHub', 'CRAN', 'GitHubOnly', 'CRANOnly', 'Both'):
    required[column] = float('nan')
In [14]:
# Flag every package that is required from GitHub / from CRAN with a 1 in the
# matching column; packages that are never required keep NaN.
for column, source in (('GitHub', 'github'), ('CRAN', 'cran')):
    for name in cran_required[source]:
        required.loc[name, column] = 1
In [15]:
# Preview the first ten rows.
required.head(10)
Out[15]:
In [16]:
def __F(row):
    """Derive the mutually exclusive 'Both' / 'GitHubOnly' / 'CRANOnly' flags.

    Reads ``row['GitHub']`` and ``row['CRAN']`` and sets exactly one of the
    three derived entries to 1:

    * 'Both'       when both flags equal 1,
    * 'GitHubOnly' when only 'GitHub' equals 1,
    * 'CRANOnly'   when only 'CRAN' equals 1.

    When neither flag equals 1 (e.g. both are NaN) nothing is set.

    The work is done on a copy: ``DataFrame.apply`` does not support functions
    that mutate the object they are given, so the input row is left untouched.

    :param row: mapping-like row (pandas Series or dict) with the keys above.
    :return: a copy of ``row`` with the derived flag filled in.
    """
    row = row.copy()
    by_github = row['GitHub'] == 1
    by_cran = row['CRAN'] == 1
    if by_github and by_cran:
        row['Both'] = 1
    elif by_github:
        row['GitHubOnly'] = 1
    elif by_cran:
        row['CRANOnly'] = 1
    return row
In [17]:
fields = ['GitHub', 'CRAN', 'GitHubOnly', 'CRANOnly', 'Both']

# Classify every row, then keep only the timestamp and the flag columns.
# The explicit copy makes `d` an independent frame, so the column assignment
# below cannot be reported as a write to a slice of another frame.
d = required.apply(__F, axis=1)[['mtime'] + fields].copy()
d['mtime'] = pandas.to_datetime(d['mtime'])

# Chronological order, indexed by `mtime`.
d = d.set_index('mtime').sort_index()

# Running count per flag over time. `cumsum` leaves NaN on rows where a flag
# is not set, so the last known total is carried forward; `ffill()` is the
# non-deprecated spelling of `fillna(method='pad')`.
d.cumsum().ffill().plot(figsize=(15, 6))
Out[17]:
In [18]:
# Age of each package in whole days, measured back from 2015-06-01
# (NOTE(review): presumably the date of the CRAN snapshot file -- confirm).
# `.days` is used because current pandas rejects `astype('timedelta64[D]')`.
reference_date = pandas.to_datetime('2015-06-01')
d['days'] = (reference_date - d.index).days

# '<flag>D' holds the age where the flag is 1 and NaN where it is not set,
# giving one age sample per flag.
for field in fields:
    d['{}D'.format(field)] = d['days'] * d[field]
In [19]:
d[['{}D'.format(field) for field in fields]].plot(kind='box')
Out[19]:
In [20]:
from scipy.stats import mannwhitneyu
from scipy.stats import norm
from math import sqrt
# Pairwise Mann-Whitney comparison of the '<flag>D' age samples.
# Results are nested dicts keyed as table[field1][field2]:
#   Ud -> U statistic (the smaller of U1 and U2),
#   zd -> normal-approximation z score (tie-free variance, no continuity
#         correction),
#   pd -> two-sided p-value, 2 * Phi(z).
Ud = {}
zd = {}
pd = {}

# Drop the NaN entries once per field instead of once per pair.
samples = dict((field, d[field + 'D'].dropna()) for field in fields)

for field1 in fields:
    d1 = samples[field1]
    n1 = len(d1)
    for field2 in fields:
        d2 = samples[field2]
        n2 = len(d2)
        u = mannwhitneyu(d1, d2, use_continuity=False)[0]
        # The formulas below need the smaller of the two U statistics so that
        # z <= 0 and 2 * cdf(z) <= 1. Older SciPy returned that value
        # directly; newer SciPy returns U for the first sample, so fold it
        # (U1 + U2 == n1 * n2).
        u = min(u, n1 * n2 - u)
        z = (u - (n1 * n2) / 2.0) / sqrt(n1 * n2 * (n1 + n2 + 1) / 12.0)
        Ud.setdefault(field1, {})[field2] = u
        zd.setdefault(field1, {})[field2] = z
        pd.setdefault(field1, {})[field2] = 2 * norm.cdf(z)
In [21]:
# U statistics as a matrix: rows are the first field of each pair, columns
# the second, both sorted by label.
u_table = pandas.DataFrame.from_dict(Ud, orient='index')
u_table.sort_index(axis=0).sort_index(axis=1)
Out[21]:
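Caution: each p-value must be compared against a global significance level $\alpha$. With 5 groups there are $\binom{5}{2} = 10$ distinct pairwise comparisons (we only look at the lower part of the matrix), so under a global $\alpha = 0.05$ each p-value has to be multiplied by 10 (Bonferroni correction) before it is compared to $\alpha$.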
In [22]:
# p-values as a matrix: rows are the first field of each pair, columns the
# second, both sorted by label.
p_table = pandas.DataFrame.from_dict(pd, orient='index')
p_table.sort_index(axis=0).sort_index(axis=1)
Out[22]:
In [23]:
# z scores as a matrix: rows are the first field of each pair, columns the
# second, both sorted by label.
z_table = pandas.DataFrame.from_dict(zd, orient='index')
z_table.sort_index(axis=0).sort_index(axis=1)
Out[23]: